In [1]:
import numpy as np
import pandas as pd

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [3]:
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [4]:
def get_numpy_data(dataset, features, output_name):
    dataset['constant'] = 1
    return dataset[['constant'] + features].values, dataset[output_name].values.reshape(
            (len(dataset[output_name].values), 1))

In [5]:
def predict_output(feature_matrix, weights):
    return feature_matrix.dot(weights)

In [33]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    return features/norms, norms.reshape((len(norms), 1))

In [7]:
def compute_roi(i, feature_matrix, output, prediction, weights):
    return sum(feature_matrix[:, i:i + 1] * (output - prediction + weights[i, 0] * feature_matrix[:, i:i + 1]))

In [8]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    prediction = predict_output(feature_matrix, weights)
    ro_i = compute_roi(i, feature_matrix, output, prediction, weights)
    # print('Ro_%s: %s, l1_penalty / 2.: %s' % (i, ro_i, l1_penalty / 2.))
    if i == 0:
        new_weight_i = ro_i
    elif ro_i < -l1_penalty / 2.:
        new_weight_i = ro_i + l1_penalty / 2.
    elif ro_i > l1_penalty / 2.:
        new_weight_i = ro_i - l1_penalty / 2.
    else:
        new_weight_i = 0
    return new_weight_i

In [9]:
# should print 0.425558846691
import math
print(lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]]), np.array([1., 1.]).reshape((2,1)), np.array([1., 4.]).reshape((2,1)), 0.1))


[ 0.42555885]

Effect of L1 penalty


In [10]:
initial_weights = np.array([1, 4, 1]).reshape((3,1))
features_matrix, output = get_numpy_data(sales, ['sqft_living', 'bedrooms'], 'price')
features_matrix_normalized, norm = normalize_features(features_matrix)

In [11]:
prediction = predict_output(features_matrix_normalized, initial_weights)
ro_0 = compute_roi(0, features_matrix_normalized, output, prediction, initial_weights)
print(ro_0)
ro_1 = compute_roi(1, features_matrix_normalized, output, prediction, initial_weights)
print(ro_1)
ro_2 = compute_roi(2, features_matrix_normalized, output, prediction, initial_weights)
print(ro_2)


[ 79400300.01452321]
[ 87939470.82325152]
[ 80966698.66623905]

In [23]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    weights = initial_weights
    change_magnitude = np.zeros((len(weights), 1))
    converged = False
    cycle_count = 0
    while not converged:
        # new_weights = weights
        for i in range(len(weights)):
            weights_i = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            change_magnitude[i] = abs(weights[i, 0] - weights_i)
            # print('Change magnitude for %s: %s' % (i, change_magnitude[i]))
            # new_weights[i] = weights_i
            weights[i, 0] = weights_i
        # weights = new_weights
        magnitude_ = sum(change_magnitude)
        # print('Weights: %s' % weights)
        cycle_count += 1
        if cycle_count % 5 == 0:
            print('%s cycles passed. Magnitude: %s' % (cycle_count, magnitude_))
        if magnitude_ < tolerance or cycle_count > 1000:
            converged = True
    return weights

In [13]:
features_matrix, output = get_numpy_data(sales, ['sqft_living', 'bedrooms'], 'price')
normalized_feature_matrix, norm = normalize_features(features_matrix)
print(norm)
print(normalized_feature_matrix)
l1_penalty = 1e7
tolerance = 1
lasso_weights = lasso_cyclical_coordinate_descent(normalized_feature_matrix, output, np.array([0, 0, 0]).reshape((3,1)), l1_penalty, tolerance)
print(lasso_weights)


[  1.47013605e+02   3.34257264e+05   5.14075870e+02]
[[ 0.00680209  0.00353021  0.00583571]
 [ 0.00680209  0.00768869  0.00583571]
 [ 0.00680209  0.00230361  0.00389048]
 ..., 
 [ 0.00680209  0.00305154  0.00389048]
 [ 0.00680209  0.00478673  0.00583571]
 [ 0.00680209  0.00305154  0.00389048]]
5 cycles passed. Magnitude: [ 10573774.65507453]
10 cycles passed. Magnitude: [ 4339310.74125477]
15 cycles passed. Magnitude: [ 1780784.82376609]
20 cycles passed. Magnitude: [ 730805.26826212]
25 cycles passed. Magnitude: [ 299911.13064792]
30 cycles passed. Magnitude: [ 123079.74001583]
35 cycles passed. Magnitude: [ 50509.79389759]
40 cycles passed. Magnitude: [ 20728.7994532]
45 cycles passed. Magnitude: [ 8506.04351887]
50 cycles passed. Magnitude: [ 3490.71473436]
55 cycles passed. Magnitude: [ 1432.33454936]
60 cycles passed. Magnitude: [ 587.84190048]
65 cycles passed. Magnitude: [ 241.1840947]
70 cycles passed. Magnitude: [ 98.35301779]
75 cycles passed. Magnitude: [ 40.57295382]
80 cycles passed. Magnitude: [ 16.81329484]
85 cycles passed. Magnitude: [ 6.29690337]
90 cycles passed. Magnitude: [ 2.33696122]
95 cycles passed. Magnitude: [ 1.48481113]
100 cycles passed. Magnitude: [ 1.51604535]
[[21624995]
 [63157249]
 [       0]]

In [14]:
errors = predict_output(normalized_feature_matrix, lasso_weights)
rss = errors.T.dot(errors)
print('RSS: %s' % rss)


RSS: [[  6.95526005e+15]]

Evaluate coordinate descent on larger set of features


In [15]:
train_data = pd.read_csv('kc_house_train_data.csv')
test_data = pd.read_csv('kc_house_test_data.csv')

In [34]:
all_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']
train_features_matrix, train_output = get_numpy_data(train_data, all_features, 'price')
train_features_matrix_normalized, train_norm = normalize_features(train_features_matrix)

In [35]:
l1_penalty = 1e7
tolerance = 1
weights1e7 = lasso_cyclical_coordinate_descent(train_features_matrix_normalized, train_output,
                                               np.zeros((len(all_features) + 1, 1)),
                                               l1_penalty,
                                               tolerance)
print(weights1e7)


5 cycles passed. Magnitude: [ 11859917.18352854]
10 cycles passed. Magnitude: [ 5583360.66817527]
15 cycles passed. Magnitude: [ 1057396.13849319]
20 cycles passed. Magnitude: [ 446572.45915572]
25 cycles passed. Magnitude: [ 190050.53755865]
30 cycles passed. Magnitude: [ 80882.64846321]
35 cycles passed. Magnitude: [ 34422.43854406]
40 cycles passed. Magnitude: [ 14649.67206017]
45 cycles passed. Magnitude: [ 6234.68006782]
50 cycles passed. Magnitude: [ 2653.3860546]
55 cycles passed. Magnitude: [ 1129.24119268]
60 cycles passed. Magnitude: [ 480.58806567]
65 cycles passed. Magnitude: [ 204.53105217]
70 cycles passed. Magnitude: [ 87.0453393]
75 cycles passed. Magnitude: [ 37.04518678]
80 cycles passed. Magnitude: [ 15.76587475]
85 cycles passed. Magnitude: [ 6.70971866]
90 cycles passed. Magnitude: [ 2.85555523]
95 cycles passed. Magnitude: [ 1.21528147]
[[ 24429597.96727435]
 [        0.        ]
 [        0.        ]
 [ 48389176.90646759]
 [        0.        ]
 [        0.        ]
 [  3317511.2178916 ]
 [  7329961.67180582]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]]

In [20]:
l1_penalty=1e8
weights1e8 = lasso_cyclical_coordinate_descent(train_features_matrix_normalized, train_output,
                                               np.zeros((len(all_features) + 1, 1)),
                                               l1_penalty,
                                               tolerance)
print(weights1e8)


[[ 71114625.71488713]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]
 [        0.        ]]

In [24]:
l1_penalty=1e4
weights1e4 = lasso_cyclical_coordinate_descent(train_features_matrix_normalized, train_output,
                                               np.zeros((len(all_features) + 1, 1)),
                                               l1_penalty,
                                               tolerance)
print(weights1e4)


5 cycles passed. Magnitude: [ 18077504.27530563]
10 cycles passed. Magnitude: [ 6371784.58255979]
15 cycles passed. Magnitude: [ 5426454.7063846]
20 cycles passed. Magnitude: [ 5142779.24737815]
25 cycles passed. Magnitude: [ 4963747.49404388]
30 cycles passed. Magnitude: [ 4595972.09881297]
35 cycles passed. Magnitude: [ 4167302.12166832]
40 cycles passed. Magnitude: [ 3748617.7136577]
45 cycles passed. Magnitude: [ 3401393.98532931]
50 cycles passed. Magnitude: [ 3218626.93722823]
55 cycles passed. Magnitude: [ 3048068.4342303]
60 cycles passed. Magnitude: [ 2895417.09009566]
65 cycles passed. Magnitude: [ 2759494.80573087]
70 cycles passed. Magnitude: [ 2636132.08227795]
75 cycles passed. Magnitude: [ 2524647.69130691]
80 cycles passed. Magnitude: [ 2424944.59909478]
85 cycles passed. Magnitude: [ 2336518.50397539]
90 cycles passed. Magnitude: [ 2257073.27017595]
95 cycles passed. Magnitude: [ 2190258.96046202]
100 cycles passed. Magnitude: [ 2135935.96345648]
105 cycles passed. Magnitude: [ 2084294.04467025]
110 cycles passed. Magnitude: [ 2035209.55168389]
115 cycles passed. Magnitude: [ 1988515.72553825]
120 cycles passed. Magnitude: [ 1944025.20649928]
125 cycles passed. Magnitude: [ 1901545.27077399]
130 cycles passed. Magnitude: [ 1860887.83578987]
135 cycles passed. Magnitude: [ 1821875.72000974]
140 cycles passed. Magnitude: [ 1784346.23869701]
145 cycles passed. Magnitude: [ 1748152.92669231]
150 cycles passed. Magnitude: [ 1713165.96974641]
155 cycles passed. Magnitude: [ 1679271.77270914]
160 cycles passed. Magnitude: [ 1646371.97891942]
165 cycles passed. Magnitude: [ 1614382.16944214]
170 cycles passed. Magnitude: [ 1583230.40605071]
175 cycles passed. Magnitude: [ 1552855.73264584]
180 cycles passed. Magnitude: [ 1523206.71287511]
185 cycles passed. Magnitude: [ 1494240.05396056]
190 cycles passed. Magnitude: [ 1465919.34638821]
195 cycles passed. Magnitude: [ 1438213.93428008]
200 cycles passed. Magnitude: [ 1411097.92081927]
205 cycles passed. Magnitude: [ 1384549.30599547]
210 cycles passed. Magnitude: [ 1358549.2492327]
215 cycles passed. Magnitude: [ 1333081.44673792]
220 cycles passed. Magnitude: [ 1308131.61189223]
225 cycles passed. Magnitude: [ 1283687.04664613]
230 cycles passed. Magnitude: [ 1259736.29193454]
235 cycles passed. Magnitude: [ 1236268.84586933]
240 cycles passed. Magnitude: [ 1213274.93928161]
245 cycles passed. Magnitude: [ 1190745.35921233]
250 cycles passed. Magnitude: [ 1168671.31212184]
255 cycles passed. Magnitude: [ 1147044.31947692]
260 cycles passed. Magnitude: [ 1125856.13959952]
265 cycles passed. Magnitude: [ 1105098.71041835]
270 cycles passed. Magnitude: [ 1084764.10876409]
275 cycles passed. Magnitude: [ 1064844.52246694]
280 cycles passed. Magnitude: [ 1045332.2322878]
285 cycles passed. Magnitude: [ 1026219.60124242]
290 cycles passed. Magnitude: [ 1007499.06930435]
295 cycles passed. Magnitude: [ 989163.1519978]
300 cycles passed. Magnitude: [ 971204.44160467]
305 cycles passed. Magnitude: [ 953615.61010564]
310 cycles passed. Magnitude: [ 936389.41308603]
315 cycles passed. Magnitude: [ 919518.69415279]
320 cycles passed. Magnitude: [ 902996.38938609]
325 cycles passed. Magnitude: [ 886815.5316676]
330 cycles passed. Magnitude: [ 870969.25461148]
335 cycles passed. Magnitude: [ 855450.79605268]
340 cycles passed. Magnitude: [ 840253.50101456]
345 cycles passed. Magnitude: [ 825370.824117]
350 cycles passed. Magnitude: [ 810796.33145863]
355 cycles passed. Magnitude: [ 796523.70197935]
360 cycles passed. Magnitude: [ 782546.72834434]
365 cycles passed. Magnitude: [ 768859.31738664]
370 cycles passed. Magnitude: [ 755455.49016937]
375 cycles passed. Magnitude: [ 742329.38169811]
380 cycles passed. Magnitude: [ 729475.24031108]
385 cycles passed. Magnitude: [ 716887.42685278]
390 cycles passed. Magnitude: [ 704560.4136005]
395 cycles passed. Magnitude: [ 692488.78301264]
400 cycles passed. Magnitude: [ 680667.22633795]
405 cycles passed. Magnitude: [ 669090.54209439]
410 cycles passed. Magnitude: [ 657753.6344605]
415 cycles passed. Magnitude: [ 646651.51157418]
420 cycles passed. Magnitude: [ 635779.28381495]
425 cycles passed. Magnitude: [ 625132.16199923]
430 cycles passed. Magnitude: [ 614705.45558152]
435 cycles passed. Magnitude: [ 604494.57083596]
440 cycles passed. Magnitude: [ 594495.00901769]
445 cycles passed. Magnitude: [ 584702.36455201]
450 cycles passed. Magnitude: [ 575112.3232139]
455 cycles passed. Magnitude: [ 565720.66032029]
460 cycles passed. Magnitude: [ 556523.23894995]
465 cycles passed. Magnitude: [ 547516.0081885]
470 cycles passed. Magnitude: [ 538695.00137949]
475 cycles passed. Magnitude: [ 530056.33442009]
480 cycles passed. Magnitude: [ 521596.2040704]
485 cycles passed. Magnitude: [ 513310.88629588]
490 cycles passed. Magnitude: [ 505196.73464908]
495 cycles passed. Magnitude: [ 497250.17866011]
500 cycles passed. Magnitude: [ 489467.7222797]
505 cycles passed. Magnitude: [ 481845.94233205]
510 cycles passed. Magnitude: [ 474381.48700496]
515 cycles passed. Magnitude: [ 467071.07437358]
520 cycles passed. Magnitude: [ 459911.49095843]
525 cycles passed. Magnitude: [ 452904.31635183]
530 cycles passed. Magnitude: [ 446048.12900327]
535 cycles passed. Magnitude: [ 439333.28879217]
540 cycles passed. Magnitude: [ 432756.8470397]
545 cycles passed. Magnitude: [ 425885.10087065]
550 cycles passed. Magnitude: [ 419320.16989332]
555 cycles passed. Magnitude: [ 412725.53424221]
560 cycles passed. Magnitude: [ 406374.31754538]
565 cycles passed. Magnitude: [ 400229.301218]
570 cycles passed. Magnitude: [ 394262.68614701]
575 cycles passed. Magnitude: [ 388453.66488366]
580 cycles passed. Magnitude: [ 382786.61077837]
585 cycles passed. Magnitude: [ 377249.76998519]
590 cycles passed. Magnitude: [ 371834.27922332]
595 cycles passed. Magnitude: [ 366602.62456315]
600 cycles passed. Magnitude: [ 361539.47213243]
605 cycles passed. Magnitude: [ 356643.05777298]
610 cycles passed. Magnitude: [ 351864.46732221]
615 cycles passed. Magnitude: [ 347175.34112466]
620 cycles passed. Magnitude: [ 342562.51279337]
625 cycles passed. Magnitude: [ 338021.66084347]
630 cycles passed. Magnitude: [ 333552.90236251]
635 cycles passed. Magnitude: [ 329158.16651828]
640 cycles passed. Magnitude: [ 324839.78454851]
645 cycles passed. Magnitude: [ 320599.81161401]
650 cycles passed. Magnitude: [ 316439.75465922]
655 cycles passed. Magnitude: [ 312360.51012164]
660 cycles passed. Magnitude: [ 308362.40131303]
665 cycles passed. Magnitude: [ 304445.25647027]
670 cycles passed. Magnitude: [ 300608.49767993]
675 cycles passed. Magnitude: [ 296851.22632184]
680 cycles passed. Magnitude: [ 293172.29907716]
685 cycles passed. Magnitude: [ 289570.39256693]
690 cycles passed. Magnitude: [ 286044.05675674]
695 cycles passed. Magnitude: [ 282591.75804993]
700 cycles passed. Magnitude: [ 279211.91350034]
705 cycles passed. Magnitude: [ 275902.91738165]
710 cycles passed. Magnitude: [ 272663.1614833]
715 cycles passed. Magnitude: [ 269491.05025099]
720 cycles passed. Magnitude: [ 266385.01177449]
725 cycles passed. Magnitude: [ 263343.50550663]
730 cycles passed. Magnitude: [ 260365.02733283]
735 cycles passed. Magnitude: [ 257448.11270929]
740 cycles passed. Magnitude: [ 254591.33826081]
745 cycles passed. Magnitude: [ 251793.32225572]
750 cycles passed. Magnitude: [ 249052.72425482]
755 cycles passed. Magnitude: [ 246368.24420074]
760 cycles passed. Magnitude: [ 243738.62113648]
765 cycles passed. Magnitude: [ 241162.63167234]
770 cycles passed. Magnitude: [ 238639.08833058]
775 cycles passed. Magnitude: [ 236166.83785025]
780 cycles passed. Magnitude: [ 233744.75950055]
785 cycles passed. Magnitude: [ 231371.76345506]
790 cycles passed. Magnitude: [ 229046.78922382]
795 cycles passed. Magnitude: [ 226768.80420703]
800 cycles passed. Magnitude: [ 224536.80232776]
805 cycles passed. Magnitude: [ 222349.80279138]
810 cycles passed. Magnitude: [ 220206.84891585]
815 cycles passed. Magnitude: [ 218107.00708189]
820 cycles passed. Magnitude: [ 216049.36579226]
825 cycles passed. Magnitude: [ 214033.03475741]
830 cycles passed. Magnitude: [ 212057.14412336]
835 cycles passed. Magnitude: [ 210120.843716]
840 cycles passed. Magnitude: [ 208223.30238169]
845 cycles passed. Magnitude: [ 206363.70737653]
850 cycles passed. Magnitude: [ 204541.26378226]
855 cycles passed. Magnitude: [ 202755.19398289]
860 cycles passed. Magnitude: [ 201004.73717925]
865 cycles passed. Magnitude: [ 199289.14892486]
870 cycles passed. Magnitude: [ 197607.70070859]
875 cycles passed. Magnitude: [ 195959.67953637]
880 cycles passed. Magnitude: [ 194344.38754952]
885 cycles passed. Magnitude: [ 192761.14167435]
890 cycles passed. Magnitude: [ 191209.27324361]
895 cycles passed. Magnitude: [ 189688.1277083]
900 cycles passed. Magnitude: [ 187532.97519427]
905 cycles passed. Magnitude: [ 186155.53728232]
910 cycles passed. Magnitude: [ 184791.80952353]
915 cycles passed. Magnitude: [ 183450.59174092]
920 cycles passed. Magnitude: [ 182134.84333255]
925 cycles passed. Magnitude: [ 180845.28957457]
930 cycles passed. Magnitude: [ 179581.78688884]
935 cycles passed. Magnitude: [ 178343.87824957]
940 cycles passed. Magnitude: [ 177131.01094234]
945 cycles passed. Magnitude: [ 175942.61693701]
950 cycles passed. Magnitude: [ 174778.13958494]
955 cycles passed. Magnitude: [ 173637.04041023]
960 cycles passed. Magnitude: [ 172518.79925091]
965 cycles passed. Magnitude: [ 171770.47234096]
970 cycles passed. Magnitude: [ 171125.52633565]
975 cycles passed. Magnitude: [ 170493.04052191]
980 cycles passed. Magnitude: [ 169872.66037468]
985 cycles passed. Magnitude: [ 169264.0573548]
990 cycles passed. Magnitude: [ 168676.2532717]
995 cycles passed. Magnitude: [ 168201.66314768]
1000 cycles passed. Magnitude: [ 167719.32156848]
[[  1.47465697e+08]
 [ -1.79251810e+07]
 [  8.68932901e+02]
 [  1.88519800e+08]
 [ -1.59866712e+06]
 [  4.95360966e+05]
 [  6.76772376e+06]
 [  5.44624529e+06]
 [  2.10554820e+07]
 [  1.13254584e+08]
 [ -1.14835732e+08]
 [ -2.94131055e+07]
 [ -2.46476790e+08]
 [  2.98585891e+06]]

In [38]:
normalized_weights1e7 = weights1e7/train_norm
normalized_weights1e4 = weights1e4/train_norm
normalized_weights1e8 = weights1e8/train_norm

In [41]:
test_feature_matrix, test_output = get_numpy_data(test_data, all_features, 'price')
prediction1e7 = predict_output(test_feature_matrix, normalized_weights1e7)
prediction1e4 = predict_output(test_feature_matrix, normalized_weights1e4)
prediction1e8 = predict_output(test_feature_matrix, normalized_weights1e8)

In [42]:
error1e7 = test_output - prediction1e7
error1e4 = test_output - prediction1e4
error1e8 = test_output - prediction1e8

In [43]:
RSS1e7 = error1e7.T.dot(error1e7)
RSS1e4 = error1e7.T.dot(error1e4)
RSS1e8 = error1e7.T.dot(error1e8)

In [45]:
RSS = [RSS1e4, RSS1e7, RSS1e8]

In [46]:
min(RSS)


Out[46]:
array([[  2.00103066e+14]])

In [47]:
RSS.index(min(RSS))


Out[47]:
0

In [48]:
RSS


Out[48]:
[array([[  2.00103066e+14]]),
 array([[  2.75962072e+14]]),
 array([[  3.41122091e+14]])]

In [ ]: